Understand The data¶

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math
import pickle
sns.set()
credit_data=pd.read_csv('../data/credit.csv')
print('Dimensionality before deleting duplicate values:', credit_data.shape)

# Removing duplicate rows if any
credit_data=credit_data.drop_duplicates()
print('Dimensionality After deleting duplicate values:', credit_data.shape)

# Printing sample data
# Start observing the Quantitative/Categorical/Qualitative variables
credit_data.head(10)
Dimensionality before deleting duplicate values: (1000, 17)
Dimensionality After deleting duplicate values: (1000, 17)
Out[1]:
checking_balance months_loan_duration credit_history purpose amount savings_balance employment_duration percent_of_income years_at_residence age other_credit housing existing_loans_count job dependents phone default
0 < 0 DM 6 critical furniture/appliances 1169 unknown > 7 years 4 4 67 none own 2 skilled 1 yes no
1 1 - 200 DM 48 good furniture/appliances 5951 < 100 DM 1 - 4 years 2 2 22 none own 1 skilled 1 no yes
2 unknown 12 critical education 2096 < 100 DM 4 - 7 years 2 3 49 none own 1 unskilled 2 no no
3 < 0 DM 42 good furniture/appliances 7882 < 100 DM 4 - 7 years 2 4 45 none other 1 skilled 2 no no
4 < 0 DM 24 poor car 4870 < 100 DM 1 - 4 years 3 4 53 none other 2 skilled 2 no yes
5 unknown 36 good education 9055 unknown 1 - 4 years 2 4 35 none other 1 unskilled 2 yes no
6 unknown 24 good furniture/appliances 2835 500 - 1000 DM > 7 years 3 4 53 none own 1 skilled 1 no no
7 1 - 200 DM 36 good car 6948 < 100 DM 1 - 4 years 2 2 35 none rent 1 management 1 yes no
8 unknown 12 good furniture/appliances 3059 > 1000 DM 4 - 7 years 2 4 61 none own 1 unskilled 1 no no
9 1 - 200 DM 30 critical car 5234 < 100 DM unemployed 4 2 28 none own 2 management 1 no yes
In [2]:
credit_data.columns
Out[2]:
Index(['checking_balance', 'months_loan_duration', 'credit_history', 'purpose',
       'amount', 'savings_balance', 'employment_duration', 'percent_of_income',
       'years_at_residence', 'age', 'other_credit', 'housing',
       'existing_loans_count', 'job', 'dependents', 'phone', 'default'],
      dtype='object')

Looking at the distribution of Target variable¶

In [3]:
good_bad_per=round(((credit_data.default.value_counts()/credit_data.default.count())*100))
good_bad_per
plt.pie(good_bad_per,labels=['Good loans', 'Bad loans'], autopct='%1.0f%%', startangle=90)
plt.title('Percentage of good and bad loans');
plt.savefig('../results/Percentage_of_good_and_bad_loans.png')

Basic Data Exploration¶

In [4]:
# Observing the summarized information of data
# Data types, Missing values based on number of non-null values Vs total rows etc.
# Remove those variables from data which have too many missing values (Missing Values > 30%)
# Remove Qualitative variables which cannot be used in Machine Learning
credit_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   checking_balance      1000 non-null   object
 1   months_loan_duration  1000 non-null   int64 
 2   credit_history        1000 non-null   object
 3   purpose               1000 non-null   object
 4   amount                1000 non-null   int64 
 5   savings_balance       1000 non-null   object
 6   employment_duration   1000 non-null   object
 7   percent_of_income     1000 non-null   int64 
 8   years_at_residence    1000 non-null   int64 
 9   age                   1000 non-null   int64 
 10  other_credit          1000 non-null   object
 11  housing               1000 non-null   object
 12  existing_loans_count  1000 non-null   int64 
 13  job                   1000 non-null   object
 14  dependents            1000 non-null   int64 
 15  phone                 1000 non-null   object
 16  default               1000 non-null   object
dtypes: int64(7), object(10)
memory usage: 132.9+ KB
In [5]:
# Looking at the descriptive statistics of the data
credit_data.describe(include='all')
Out[5]:
checking_balance months_loan_duration credit_history purpose amount savings_balance employment_duration percent_of_income years_at_residence age other_credit housing existing_loans_count job dependents phone default
count 1000 1000.000000 1000 1000 1000.000000 1000 1000 1000.000000 1000.000000 1000.000000 1000 1000 1000.000000 1000 1000.000000 1000 1000
unique 4 NaN 5 6 NaN 5 5 NaN NaN NaN 3 3 NaN 4 NaN 2 2
top unknown NaN good furniture/appliances NaN < 100 DM 1 - 4 years NaN NaN NaN none own NaN skilled NaN no no
freq 394 NaN 530 473 NaN 603 339 NaN NaN NaN 814 713 NaN 630 NaN 596 700
mean NaN 20.903000 NaN NaN 3271.258000 NaN NaN 2.973000 2.845000 35.546000 NaN NaN 1.407000 NaN 1.155000 NaN NaN
std NaN 12.058814 NaN NaN 2822.736876 NaN NaN 1.118715 1.103718 11.375469 NaN NaN 0.577654 NaN 0.362086 NaN NaN
min NaN 4.000000 NaN NaN 250.000000 NaN NaN 1.000000 1.000000 19.000000 NaN NaN 1.000000 NaN 1.000000 NaN NaN
25% NaN 12.000000 NaN NaN 1365.500000 NaN NaN 2.000000 2.000000 27.000000 NaN NaN 1.000000 NaN 1.000000 NaN NaN
50% NaN 18.000000 NaN NaN 2319.500000 NaN NaN 3.000000 3.000000 33.000000 NaN NaN 1.000000 NaN 1.000000 NaN NaN
75% NaN 24.000000 NaN NaN 3972.250000 NaN NaN 4.000000 4.000000 42.000000 NaN NaN 2.000000 NaN 1.000000 NaN NaN
max NaN 72.000000 NaN NaN 18424.000000 NaN NaN 4.000000 4.000000 75.000000 NaN NaN 4.000000 NaN 2.000000 NaN NaN
In [6]:
# Finging unique values for each column
# TO understand which column is categorical and which one is Continuous
# Typically if the numer of unique values are < 20 then the variable is likely to be a category otherwise continuous
credit_data.nunique()
Out[6]:
checking_balance          4
months_loan_duration     33
credit_history            5
purpose                   6
amount                  921
savings_balance           5
employment_duration       5
percent_of_income         4
years_at_residence        4
age                      53
other_credit              3
housing                   3
existing_loans_count      4
job                       4
dependents                2
phone                     2
default                   2
dtype: int64

Visual Exploratory Data Analysis¶

Categorical variables: Bar plot

In [7]:
def PlotBarCharts(inpData, colsToPlot):    
    # Generating multiple subplots
    fig, subPlot=plt.subplots(nrows=3, ncols=5, figsize=(20,20))
    fig.suptitle('Bar charts of: '+ str(colsToPlot))
    plt.subplots_adjust(left=0.1,
                        bottom=0.1,
                        right=0.9,
                        top=0.9,
                        wspace=0.4,
                        hspace=0.4)
    for colName, plotNumber in zip(colsToPlot, range(len(colsToPlot))):
        row_num = int(plotNumber/5)
        column_num = int(plotNumber%5)
        subplt = sns.countplot(x=inpData[colName],ax=subPlot[row_num][column_num])
        subplt.set_xticklabels(subplt.get_xticklabels(), rotation=40, ha="right")
    plt.delaxes(subPlot[-1][-1])
    plt.delaxes(subPlot[-1][-2])
    plt.tight_layout()
    plt.savefig('../results/visual_exploratory_data_analysis_categorical.png')
In [8]:
#####################################################################
# Calling the function for bar chart
PlotBarCharts(inpData=credit_data, 
              colsToPlot=['checking_balance', 'credit_history', 'purpose','savings_balance','employment_duration','percent_of_income', 'years_at_residence', 'other_credit','housing','existing_loans_count','job', 'dependents', 'phone'])

Continuous variables: Histogram

In [9]:
def PlotHistCharts(inpData, colsToPlot):    
    # Generating multiple subplots
    fig, subPlot=plt.subplots(1,3, figsize=(20,5))
    fig.suptitle('Histogram charts of: '+ str(colsToPlot))
    for colName, plotNumber in zip(colsToPlot, range(len(colsToPlot))):
        subplt = sns.histplot(data=inpData, x=colName, kde=True,ax=subPlot[plotNumber])
    plt.tight_layout()
    plt.savefig('../results/visual_exploratory_data_analysis_continuous.png')
In [10]:
#####################################################################
# Calling the function for histogram chart
PlotHistCharts(inpData=credit_data, 
              colsToPlot=['age','amount','months_loan_duration'])

Handling outliers (unusually extreme data points) and Managing missing values in the dataset

In [11]:
credit_data.isnull().sum()
Out[11]:
checking_balance        0
months_loan_duration    0
credit_history          0
purpose                 0
amount                  0
savings_balance         0
employment_duration     0
percent_of_income       0
years_at_residence      0
age                     0
other_credit            0
housing                 0
existing_loans_count    0
job                     0
dependents              0
phone                   0
default                 0
dtype: int64

No missing values in this data!

Relationship exploration: Categorical Vs Continuous -- Box Plots

In [12]:
# Box plots for Categorical Target Variable "default" and continuous predictors
continuous_cols_list=['age','amount', 'months_loan_duration']
fig, plot_canvas=plt.subplots(nrows=1, ncols=len(continuous_cols_list), figsize=(18,5))

# Creating box plots for each continuous predictor against the Target Variable "default"
for predictor_col , i in zip(continuous_cols_list, range(len(continuous_cols_list))):
    sns.boxplot(data=credit_data, x=predictor_col, y="default",ax=plot_canvas[i])
plt.savefig('../results/relationship_exploration_continuous_vs_categorical.png')
In [13]:
# Defining a function to find the statistical relationship with all the categorical variables
def FunctionAnova(inpData, TargetVariable, ContinuousPredictorList):
    from scipy.stats import f_oneway

    # Creating an empty list of final selected predictors
    SelectedPredictors=[]
    
    print('##### ANOVA Results ##### \n')
    for predictor in ContinuousPredictorList:
        CategoryGroupLists=inpData.groupby(TargetVariable)[predictor].apply(list)
        AnovaResults = f_oneway(*CategoryGroupLists)
        
        # If the ANOVA P-Value is <0.05, that means we reject H0
        if (AnovaResults[1] < 0.05):
            print(predictor, 'is correlated with', TargetVariable, '| P-Value:', AnovaResults[1])
            SelectedPredictors.append(predictor)
        else:
            print(predictor, 'is NOT correlated with', TargetVariable, '| P-Value:', AnovaResults[1])
    
    return(SelectedPredictors)
In [14]:
# Calling the function to check which categorical variables are correlated with target
continuous_variables=['age', 'amount','months_loan_duration']
FunctionAnova(inpData=credit_data, TargetVariable='default', ContinuousPredictorList=continuous_variables)
##### ANOVA Results ##### 

age is correlated with default | P-Value: 0.003925339398278295
amount is correlated with default | P-Value: 8.797572373533373e-07
months_loan_duration is correlated with default | P-Value: 6.488049877187189e-12
Out[14]:
['age', 'amount', 'months_loan_duration']

Relationship exploration: Categorical Vs Categorical -- Grouped Bar Charts¶

When the target variable is Categorical and the predictor is also Categorical then we explore the correlation between them visually using barplots and statistically using Chi-square test

In [15]:
def PlotCrossTabCharts(inpData, colsToPlot):    
    # Generating multiple subplots
    fig, subPlot=plt.subplots(nrows=3, ncols=5, figsize=(20,20))
    fig.suptitle('Bar charts of: '+ str(colsToPlot))
    plt.subplots_adjust(left=0.1,
                        bottom=0.1,
                        right=0.9,
                        top=0.9,
                        wspace=0.4,
                        hspace=0.4)
    for colName, plotNumber in zip(colsToPlot, range(len(colsToPlot))):
        row_num = int(plotNumber/5)
        col_num = int(plotNumber%5)
        CrossTabResult=pd.crosstab(index=inpData[colName], columns=inpData['default'])
        subplt = CrossTabResult.plot(kind="bar", stacked=True, rot=0,ax=subPlot[row_num][col_num])
        subplt.set_xticklabels(subplt.get_xticklabels(), rotation=40, ha="right")
    plt.delaxes(subPlot[-1][-1])
    plt.delaxes(subPlot[-1][-2])
    plt.savefig('../results/relationship_exploration_categorical_vs_categorical.png')
In [16]:
#####################################################################
# Calling the function for bar chart on CrossTab
PlotCrossTabCharts(inpData=credit_data, 
              colsToPlot=['checking_balance', 'credit_history', 'purpose','savings_balance','employment_duration','percent_of_income', 'years_at_residence', 'other_credit','housing','existing_loans_count','job', 'dependents', 'phone'])
In [17]:
# Writing a function to find the correlation of all categorical variables with the Target variable
def FunctionChisq(inpData, TargetVariable, CategoricalVariablesList):
    from scipy.stats import chi2_contingency
    
    # Creating an empty list of final selected predictors
    SelectedPredictors=[]

    for predictor in CategoricalVariablesList:
        CrossTabResult=pd.crosstab(index=inpData[TargetVariable], columns=inpData[predictor])
        ChiSqResult = chi2_contingency(CrossTabResult)
        
        # If the ChiSq P-Value is <0.05, that means we reject H0
        if (ChiSqResult[1] < 0.05):
            print(predictor, 'is correlated with', TargetVariable, '| P-Value:', ChiSqResult[1])
            SelectedPredictors.append(predictor)
        else:
            print(predictor, 'is NOT correlated with', TargetVariable, '| P-Value:', ChiSqResult[1])        
            
    return(SelectedPredictors)
In [18]:
# Calling the function
CategoricalVariables=['checking_balance', 'credit_history', 'purpose','savings_balance','employment_duration',
                     'percent_of_income', 'years_at_residence', 'other_credit','housing', 'existing_loans_count', 'job', 'dependents', 'phone']

FunctionChisq(inpData=credit_data, 
              TargetVariable='default',
              CategoricalVariablesList= CategoricalVariables)
checking_balance is correlated with default | P-Value: 1.2189020722893755e-26
credit_history is correlated with default | P-Value: 1.2791872956751013e-12
purpose is NOT correlated with default | P-Value: 0.14488884731934595
savings_balance is correlated with default | P-Value: 2.7612142385682596e-07
employment_duration is correlated with default | P-Value: 0.0010454523491402541
percent_of_income is NOT correlated with default | P-Value: 0.1400333122128481
years_at_residence is NOT correlated with default | P-Value: 0.8615521320413175
other_credit is correlated with default | P-Value: 0.0016293178186473534
housing is correlated with default | P-Value: 0.00011167465374597664
existing_loans_count is NOT correlated with default | P-Value: 0.4451440800083001
job is NOT correlated with default | P-Value: 0.5965815918843431
dependents is NOT correlated with default | P-Value: 1.0
phone is NOT correlated with default | P-Value: 0.27887615430357426
Out[18]:
['checking_balance',
 'credit_history',
 'savings_balance',
 'employment_duration',
 'other_credit',
 'housing']

Based on the results of Chi-Square test, below categorical columns are selected as predictors for Machine Learning

'checking_balance', 'credit_history', 'savings_balance', 'employment_duration', 'other_credit', 'housing'

In [19]:
SelectedColumns=['checking_balance','credit_history','savings_balance','employment_duration','other_credit',"housing",
 'age', 'amount', 'months_loan_duration']

# Selecting final columns
DataForML=credit_data[SelectedColumns]
DataForML.head()
Out[19]:
checking_balance credit_history savings_balance employment_duration other_credit housing age amount months_loan_duration
0 < 0 DM critical unknown > 7 years none own 67 1169 6
1 1 - 200 DM good < 100 DM 1 - 4 years none own 22 5951 48
2 unknown critical < 100 DM 4 - 7 years none own 49 2096 12
3 < 0 DM good < 100 DM 4 - 7 years none other 45 7882 42
4 < 0 DM poor < 100 DM 1 - 4 years none other 53 4870 24

Data Pre-processing for Machine Learning List of steps performed on predictor variables before data can be used for machine learning

Converting each Ordinal Categorical columns to numeric Converting Binary nominal Categorical columns to numeric using 1/0 mapping Converting all other nominal categorical columns to numeric using pd.get_dummies() Data Transformation (Optional): Standardization/Normalization/log/sqrt. Important if you are using distance based algorithms like KNN, or Neural Networks

Converting Ordinal variables to numeric using business mapping Based on the information on the column values explanation from the data website

https://archive.ics.uci.edu/ml/datasets/statlog+(german+credit+data)

"employment_duration" column has ordinal properties.

In [20]:
# Treating the Ordinal variable first
DataForML['employment_duration'].unique()
Out[20]:
array(['> 7 years', '1 - 4 years', '4 - 7 years', 'unemployed',
       '< 1 year'], dtype=object)
In [21]:
DataForML['employment_duration'].replace({'unemployed':1, '< 1 year':2, '1 - 4 years':3,'4 - 7 years':4, '> 7 years':5 }, inplace=True)
DataForML.head()
/var/folders/kp/l85k4_0x6hjc43hc1dtz0yhm0000gn/T/ipykernel_23139/1556491946.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DataForML['employment_duration'].replace({'unemployed':1, '< 1 year':2, '1 - 4 years':3,'4 - 7 years':4, '> 7 years':5 }, inplace=True)
Out[21]:
checking_balance credit_history savings_balance employment_duration other_credit housing age amount months_loan_duration
0 < 0 DM critical unknown 5 none own 67 1169 6
1 1 - 200 DM good < 100 DM 3 none own 22 5951 48
2 unknown critical < 100 DM 4 none own 49 2096 12
3 < 0 DM good < 100 DM 4 none other 45 7882 42
4 < 0 DM poor < 100 DM 3 none other 53 4870 24

Converting nominal variables to numeric using get_dummies()

In [22]:
# Treating all the nominal variables at once using dummy variables
DataForML_Numeric=pd.get_dummies(DataForML).astype('int')

# Adding Target Variable to the data
DataForML_Numeric['default']=credit_data['default']
DataForML_Numeric.default.replace(('yes', 'no'), (1, 0), inplace=True)

# Printing sample rows
DataForML_Numeric.head()
Out[22]:
employment_duration age amount months_loan_duration checking_balance_1 - 200 DM checking_balance_< 0 DM checking_balance_> 200 DM checking_balance_unknown credit_history_critical credit_history_good ... savings_balance_< 100 DM savings_balance_> 1000 DM savings_balance_unknown other_credit_bank other_credit_none other_credit_store housing_other housing_own housing_rent default
0 5 67 1169 6 0 1 0 0 1 0 ... 0 0 1 0 1 0 0 1 0 0
1 3 22 5951 48 1 0 0 0 0 1 ... 1 0 0 0 1 0 0 1 0 1
2 4 49 2096 12 0 0 0 1 1 0 ... 1 0 0 0 1 0 0 1 0 0
3 4 45 7882 42 0 1 0 0 0 1 ... 1 0 0 0 1 0 1 0 0 0
4 3 53 4870 24 0 1 0 0 0 0 ... 1 0 0 0 1 0 1 0 0 1

5 rows × 25 columns

In [23]:
# Saving this final data for reference during deployment
DataForML.to_pickle('../data/DataForML.pkl')
DataForML.to_csv('../data/dDataForML.csv')

Machine Learning: Splitting the data into Training and Testing sample¶

In [24]:
from sklearn.model_selection import train_test_split
X = DataForML_Numeric.drop("default" ,axis= 1)
y = DataForML_Numeric['default']
Predictors = X.columns.values.tolist()
TargetVariable='default'
### Sandardization of data ###
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# Choose either standardization or Normalization
# On this data Min Max Normalization produced better results

# Choose between standardization and MinMAx normalization
#PredictorScaler=StandardScaler()
PredictorScaler=MinMaxScaler()

# Storing the fit object for later reference
PredictorScalerFit=PredictorScaler.fit(X)

# Generating the standardized values of X
X=PredictorScalerFit.transform(X)

# Split the data into training and testing set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

Logistic Regression¶

In [25]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
# Importing cross validation function from sklearn
from sklearn.model_selection import cross_val_score
from sklearn import metrics
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# choose parameter Penalty='l2' or C=1
# choose different values for solver 'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'
logistic_regression_model = LogisticRegression(C=1,penalty='l2', solver='newton-cg')
filename = '../model/logistic_regression_model.mdl'
pickle.dump(logistic_regression_model, open(filename, 'wb'))
# Printing all the parameters of Logistic Regression
print(logistic_regression_model)
# Creating the model on Training Data
LOGREG=logistic_regression_model.fit(X_train,y_train)
prediction=LOGREG.predict(X_test)
# Measuring accuracy on Testing Data
print(metrics.classification_report(y_test, prediction))
cnf_matrix = metrics.confusion_matrix(y_test, prediction)
# Printing the Overall Accuracy of the model
F1_Score=metrics.f1_score(y_test, prediction, average='weighted')
print('Accuracy of the model on Testing Sample Data:', round(F1_Score,2))

# Running 10-Fold Cross validation on a given algorithmd
# Passing full data X and y because the K-fold will split the data and automatically choose train/test
Accuracy_Values=cross_val_score(LOGREG, X , y, cv=10, scoring='f1_weighted')
print('\nAccuracy values for 10-fold Cross Validation:\n',Accuracy_Values)
logistic_regression_acc =  round(Accuracy_Values.mean(),2)
print('\nFinal Average Accuracy of the model:', logistic_regression_acc)

coefficients = logistic_regression_model.coef_[0]

feature_importance = pd.DataFrame({'Feature': Predictors, 'Importance': np.abs(coefficients)})
feature_importance = feature_importance.sort_values('Importance', ascending=True)
feature_importance.plot(x='Feature', y='Importance', kind='barh', figsize=(10, 6))
plt.savefig('../results/logistic_regression_model_feature_importances.png',bbox_inches = 'tight')
plt.show()
class_names=[0,1] # name  of classes
fig, ax = plt.subplots()
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)
# create heatmap
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="YlGnBu" ,fmt='g')
ax.xaxis.set_label_position("top")
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.savefig('../results/logistic_regression_model_cnf_matrix.png',bbox_inches = 'tight') 
plt.show()

y_pred_proba = LOGREG.predict_proba(X_test)[::,1]
fpr, tpr, _ = metrics.roc_curve(y_test,  y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr,tpr,label="auc="+str(auc))
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.legend(loc=4)

plt.savefig('../results/logistic_regression_model_roc.png',bbox_inches = 'tight')
plt.show() 
LogisticRegression(C=1, solver='newton-cg')
              precision    recall  f1-score   support

           0       0.77      0.90      0.83       209
           1       0.62      0.38      0.48        91

    accuracy                           0.74       300
   macro avg       0.70      0.64      0.65       300
weighted avg       0.73      0.74      0.72       300

Accuracy of the model on Testing Sample Data: 0.72

Accuracy values for 10-fold Cross Validation:
 [0.744      0.70413016 0.73104474 0.73947368 0.72604645 0.7965035
 0.5875     0.73104474 0.75665219 0.78896502]

Final Average Accuracy of the model: 0.73

Decision Trees¶

In [26]:
#Decision Trees
from sklearn import tree
# Importing cross validation function from sklearn
from sklearn.model_selection import cross_val_score
from sklearn import metrics
# choose from different tunable hyper parameters
# Choose various values of max_depth and criterion for tuning the model
DTree_model = tree.DecisionTreeClassifier(max_depth=4,criterion='gini')

# Printing all the parameters of Random Forest
print(DTree_model)
# Saving Model
filename = '../model/dicision_trees_model.mdl'
pickle.dump(DTree_model, open(filename, 'wb'))
# Creating the model on Training Data
DTree=DTree_model.fit(X_train,y_train)
prediction=DTree.predict(X_test)

# Measuring accuracy on Testing Data
print(metrics.classification_report(y_test, prediction))
cnf_matrix = metrics.confusion_matrix(y_test, prediction)
# Printing the Overall Accuracy of the model
F1_Score=metrics.f1_score(y_test, prediction, average='weighted')
print('Accuracy of the model on Testing Sample Data:', round(F1_Score,2))

feature_importances = pd.Series(DTree.feature_importances_, index=Predictors)
feature_importances.nlargest(10).plot(kind='barh')
plt.savefig('../results/dicision_trees_model_feature_importances.png',bbox_inches = 'tight')


# Running 10-Fold Cross validation on a given algorithm
# Passing full data X and y because the K-fold will split the data and automatically choose train/test
Accuracy_Values=cross_val_score(DTree, X , y, cv=10, scoring='f1_weighted')
print('\nAccuracy values for 10-fold Cross Validation:\n',Accuracy_Values)
dicision_trees_acc =  round(Accuracy_Values.mean(),2)
print('\nFinal Average Accuracy of the model:', dicision_trees_acc)

class_names=[0,1] # name  of classes
fig, ax = plt.subplots()
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)
# create heatmap
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="YlGnBu" ,fmt='g')
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.savefig('../results/dicision_trees_model_cnf_matrix.png',bbox_inches = 'tight')  
DecisionTreeClassifier(max_depth=4)
              precision    recall  f1-score   support

           0       0.73      0.92      0.82       209
           1       0.57      0.23      0.33        91

    accuracy                           0.71       300
   macro avg       0.65      0.58      0.57       300
weighted avg       0.68      0.71      0.67       300

Accuracy of the model on Testing Sample Data: 0.67

Accuracy values for 10-fold Cross Validation:
 [0.74604343 0.65866667 0.74       0.6444608  0.67569892 0.64715447
 0.70133333 0.72       0.70181818 0.68801189]

Final Average Accuracy of the model: 0.69

Plotting a Decision Tree

In [27]:
import graphviz
from sklearn import tree
# DOT data
dot_data = tree.export_graphviz(DTree_model, out_file=None, 
                                feature_names=Predictors,  
                                class_names=TargetVariable,
                                filled=True)

# Draw graph
graph = graphviz.Source(dot_data, format="png") 
graph.render('../results/dicision_trees_model_dtree',format='png', view=False)
graph
# Double click on the graph to zoom in
Out[27]:
Tree 0 checking_balance_unknown <= 0.5 gini = 0.419 samples = 700 value = [491, 209] class = d 1 months_loan_duration <= 0.426 gini = 0.484 samples = 425 value = [251, 174] class = d 0->1 True 12 other_credit_none <= 0.5 gini = 0.222 samples = 275 value = [240, 35] class = d 0->12 False 2 amount <= 0.59 gini = 0.458 samples = 343 value = [221, 122] class = d 1->2 7 employment_duration <= 0.125 gini = 0.464 samples = 82 value = [30, 52] class = e 1->7 3 credit_history_perfect <= 0.5 gini = 0.451 samples = 337 value = [221, 116] class = d 2->3 6 gini = 0.0 samples = 6 value = [0, 6] class = e 2->6 4 gini = 0.437 samples = 316 value = [214, 102] class = d 3->4 5 gini = 0.444 samples = 21 value = [7, 14] class = e 3->5 8 gini = 0.0 samples = 7 value = [7, 0] class = d 7->8 9 savings_balance_unknown <= 0.5 gini = 0.425 samples = 75 value = [23, 52] class = e 7->9 10 gini = 0.375 samples = 64 value = [16, 48] class = e 9->10 11 gini = 0.463 samples = 11 value = [7, 4] class = d 9->11 13 employment_duration <= 0.125 gini = 0.423 samples = 46 value = [32, 14] class = d 12->13 20 age <= 0.08 gini = 0.167 samples = 229 value = [208, 21] class = d 12->20 14 age <= 0.196 gini = 0.375 samples = 4 value = [1, 3] class = e 13->14 17 age <= 0.455 gini = 0.387 samples = 42 value = [31, 11] class = d 13->17 15 gini = 0.0 samples = 1 value = [1, 0] class = d 14->15 16 gini = 0.0 samples = 3 value = [0, 3] class = e 14->16 18 gini = 0.444 samples = 33 value = [22, 11] class = d 17->18 19 gini = 0.0 samples = 9 value = [9, 0] class = d 17->19 21 savings_balance_< 100 DM <= 0.5 gini = 0.48 samples = 15 value = [9, 6] class = d 20->21 24 amount <= 0.421 gini = 0.13 samples = 214 value = [199, 15] class = d 20->24 22 gini = 0.0 samples = 6 value = [6, 0] class = d 21->22 23 gini = 0.444 samples = 9 value = [3, 6] class = e 21->23 25 gini = 0.102 samples = 204 value = [193, 11] class = d 24->25 26 gini = 0.48 samples = 10 value = [6, 4] class = d 24->26

Random Forest¶

In [28]:
# Random Forest (Bagging of multiple Decision Trees)
from sklearn.ensemble import RandomForestClassifier
# Importing cross validation function from sklearn
from sklearn.model_selection import cross_val_score
from sklearn import metrics
# Choose various values of max_depth, n_estimators and criterion for tuning the model
Random_Forest_model = RandomForestClassifier(max_depth=10, n_estimators=100,criterion='gini')


# Printing all the parameters of Random Forest
print(Random_Forest_model)
# Saving Model
filename = '../model/random_forest_model_model.mdl'
pickle.dump(Random_Forest_model, open(filename, 'wb'))
# Creating the model on Training Data
RF=Random_Forest_model.fit(X_train,y_train)
prediction=RF.predict(X_test)

# Measuring accuracy on Testing Data

print(metrics.classification_report(y_test, prediction))
cnf_matrix = metrics.confusion_matrix(y_test, prediction)
# Printing the Overall Accuracy of the model
F1_Score=metrics.f1_score(y_test, prediction, average='weighted')
print('Accuracy of the model on Testing Sample Data:', round(F1_Score,2))

# Running 10-Fold Cross validation on a given algorithm
# Passing full data X and y because the K-fold will split the data and automatically choose train/test
Accuracy_Values=cross_val_score(RF, X , y, cv=10, scoring='f1_weighted')
print('\nAccuracy values for 10-fold Cross Validation:\n',Accuracy_Values)
random_forest_acc =  round(Accuracy_Values.mean(),2)
print('\nFinal Average Accuracy of the model:', random_forest_acc)


# Plotting the feature importance for Top 10 most important columns
%matplotlib inline
feature_importances = pd.Series(RF.feature_importances_, index=Predictors)
feature_importances.nlargest(10).plot(kind='barh')
plt.savefig('../results/random_forest_model_feature_importances.png',bbox_inches = 'tight')

class_names=[0,1] # name  of classes
fig, ax = plt.subplots()
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)
# create heatmap
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="YlGnBu" ,fmt='g')
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.savefig('../results/random_forest_model_cnf_matrix.png',bbox_inches = 'tight') 
plt.show() 
y_pred_proba = RF.predict_proba(X_test)[::,1]
fpr, tpr, _ = metrics.roc_curve(y_test,  y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr,tpr,label="auc="+str(auc))
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.legend(loc=4)

plt.savefig('../results/random_forest_model_roc.png',bbox_inches = 'tight')
plt.show() 
RandomForestClassifier(max_depth=10)
              precision    recall  f1-score   support

           0       0.77      0.93      0.84       209
           1       0.68      0.35      0.46        91

    accuracy                           0.75       300
   macro avg       0.72      0.64      0.65       300
weighted avg       0.74      0.75      0.73       300

Accuracy of the model on Testing Sample Data: 0.73

Accuracy values for 10-fold Cross Validation:
 [0.81369863 0.72867133 0.73104474 0.75665219 0.71776316 0.74538879
 0.66029759 0.81369863 0.68       0.69317015]

Final Average Accuracy of the model: 0.73
In [29]:
import graphviz
from sklearn import tree
# DOT data
dot_data = tree.export_graphviz(Random_Forest_model.estimators_[4], out_file=None, 
                                feature_names=Predictors,  
                                class_names=TargetVariable,
                                filled=True)

# Draw graph
graph = graphviz.Source(dot_data, format="png") 
graph.render('../results/random_forest_model_dtree',format='png', view=False)
graph
# Double click on the graph to zoom in
Out[29]:
Tree 0 months_loan_duration <= 0.368 gini = 0.43 samples = 455 value = [481, 219] class = d 1 other_credit_none <= 0.5 gini = 0.373 samples = 362 value = [416, 137] class = d 0->1 True 136 housing_own <= 0.5 gini = 0.493 samples = 93 value = [65, 82] class = e 0->136 False 2 credit_history_poor <= 0.5 gini = 0.479 samples = 61 value = [56, 37] class = d 1->2 33 credit_history_critical <= 0.5 gini = 0.34 samples = 301 value = [360, 100] class = d 1->33 3 savings_balance_500 - 1000 DM <= 0.5 gini = 0.488 samples = 56 value = [49, 36] class = d 2->3 28 months_loan_duration <= 0.243 gini = 0.219 samples = 5 value = [7, 1] class = d 2->28 4 savings_balance_> 1000 DM <= 0.5 gini = 0.495 samples = 52 value = [44, 36] class = d 3->4 27 gini = 0.0 samples = 4 value = [5, 0] class = d 3->27 5 other_credit_bank <= 0.5 gini = 0.5 samples = 48 value = [37, 36] class = d 4->5 26 gini = 0.0 samples = 4 value = [7, 0] class = d 4->26 6 checking_balance_< 0 DM <= 0.5 gini = 0.375 samples = 5 value = [2, 6] class = e 5->6 13 age <= 0.429 gini = 0.497 samples = 43 value = [35, 30] class = d 5->13 7 months_loan_duration <= 0.162 gini = 0.278 samples = 3 value = [1, 5] class = e 6->7 10 amount <= 0.199 gini = 0.5 samples = 2 value = [1, 1] class = d 6->10 8 gini = 0.0 samples = 2 value = [0, 5] class = e 7->8 9 gini = 0.0 samples = 1 value = [1, 0] class = d 7->9 11 gini = 0.0 samples = 1 value = [1, 0] class = d 10->11 12 gini = 0.0 samples = 1 value = [0, 1] class = e 10->12 14 savings_balance_100 - 500 DM <= 0.5 gini = 0.478 samples = 33 value = [19, 29] class = e 13->14 23 credit_history_very good <= 0.5 gini = 0.111 samples = 10 value = [16, 1] class = d 13->23 15 checking_balance_1 - 200 DM <= 0.5 gini = 0.466 samples = 32 value = [17, 29] class = e 14->15 22 gini = 0.0 samples = 1 value = [2, 0] class = d 14->22 16 savings_balance_unknown <= 0.5 gini = 0.444 samples = 21 value = [10, 20] class = e 15->16 19 housing_other <= 0.5 gini = 0.492 samples = 11 value = [7, 9] class = e 15->19 17 gini = 0.423 samples = 18 value = [7, 16] class = e 16->17 18 gini = 0.49 samples = 3 value = [3, 4] class = e 16->18 20 gini = 0.498 samples = 10 value = [7, 8] class = e 19->20 21 gini = 0.0 samples = 1 value = [0, 1] class = e 19->21 24 gini = 0.0 samples = 9 value = [16, 0] class = d 23->24 25 gini = 0.0 samples = 1 value = [0, 1] class = e 23->25 29 gini = 0.0 samples = 3 value = [5, 0] class = d 28->29 30 age <= 0.277 gini = 0.444 samples = 2 value = [2, 1] class = d 28->30 31 gini = 0.0 samples = 1 value = [2, 0] class = d 30->31 32 gini = 0.0 samples = 1 value = [0, 1] class = e 30->32 34 amount <= 0.132 gini = 0.43 samples = 202 value = [211, 96] class = d 33->34 109 checking_balance_unknown <= 0.5 gini = 0.051 samples = 99 value = [149, 4] class = d 33->109 35 checking_balance_unknown <= 0.5 gini = 0.469 samples = 136 value = [125, 75] class = d 34->35 82 credit_history_very good <= 0.5 gini = 0.315 samples = 66 value = [86, 21] class = d 34->82 36 checking_balance_< 0 DM <= 0.5 gini = 0.495 samples = 93 value = [75, 62] class = d 35->36 69 age <= 0.08 gini = 0.328 samples = 43 value = [50, 13] class = d 35->69 37 savings_balance_> 1000 DM <= 0.5 gini = 0.438 samples = 48 value = [46, 22] class = d 36->37 52 amount <= 0.06 gini = 0.487 samples = 45 value = [29, 40] class = e 36->52 38 housing_rent <= 0.5 gini = 0.461 samples = 45 value = [39, 22] class = d 37->38 51 gini = 0.0 samples = 3 value = [7, 0] class = d 37->51 39 employment_duration <= 0.875 gini = 0.418 samples = 38 value = [33, 14] class = d 38->39 46 amount <= 0.092 gini = 0.49 samples = 7 value = [6, 8] class = e 38->46 40 credit_history_good <= 0.5 gini = 0.461 samples = 31 value = [23, 13] class = d 39->40 43 checking_balance_1 - 200 DM <= 0.5 gini = 0.165 samples = 7 value = [10, 1] class = d 39->43 41 gini = 0.32 samples = 4 value = [1, 4] class = e 40->41 42 gini = 0.412 samples = 27 value = [22, 9] class = d 40->42 44 gini = 0.0 samples = 2 value = [4, 0] class = d 43->44 45 gini = 0.245 samples = 5 value = [6, 1] class = d 43->45 47 gini = 0.0 samples = 3 value = [4, 0] class = d 46->47 48 months_loan_duration <= 0.14 gini = 0.32 samples = 4 value = [2, 8] class = e 46->48 49 gini = 0.0 samples = 1 value = [1, 0] class = d 48->49 50 gini = 0.198 samples = 3 value = [1, 8] class = e 48->50 53 housing_other <= 0.5 gini = 0.368 samples = 24 value = [9, 28] class = e 52->53 60 age <= 0.759 gini = 0.469 samples = 21 value = [20, 12] class = d 52->60 54 credit_history_good <= 0.5 gini = 0.382 samples = 22 value = [9, 26] class = e 53->54 59 gini = 0.0 samples = 2 value = [0, 2] class = e 53->59 55 gini = 0.0 samples = 1 value = [0, 1] class = e 54->55 56 amount <= 0.052 gini = 0.389 samples = 21 value = [9, 25] class = e 54->56 57 gini = 0.469 samples = 15 value = [9, 15] class = e 56->57 58 gini = 0.0 samples = 6 value = [0, 10] class = e 56->58 61 employment_duration <= 0.375 gini = 0.444 samples = 20 value = [20, 10] class = d 60->61 68 gini = 0.0 samples = 1 value = [0, 2] class = e 60->68 62 savings_balance_500 - 1000 DM <= 0.5 gini = 0.5 samples = 9 value = [7, 7] class = d 61->62 65 savings_balance_unknown <= 0.5 gini = 0.305 samples = 11 value = [13, 3] class = d 61->65 63 gini = 0.497 samples = 8 value = [6, 7] class = e 62->63 64 gini = 0.0 samples = 1 value = [1, 0] class = d 62->64 66 gini = 0.133 samples = 9 value = [13, 1] class = d 65->66 67 gini = 0.0 samples = 2 value = [0, 2] class = e 65->67 70 gini = 0.0 samples = 3 value = [0, 6] class = e 69->70 71 savings_balance_> 1000 DM <= 0.5 gini = 0.215 samples = 40 value = [50, 7] class = d 69->71 72 housing_other <= 0.5 gini = 0.168 samples = 38 value = [49, 5] class = d 71->72 79 months_loan_duration <= 0.11 gini = 0.444 samples = 2 value = [1, 2] class = e 71->79 73 months_loan_duration <= 0.228 gini = 0.171 samples = 37 value = [48, 5] class = d 72->73 78 gini = 0.0 samples = 1 value = [1, 0] class = d 72->78 74 age <= 0.223 gini = 0.214 samples = 30 value = [36, 5] class = d 73->74 77 gini = 0.0 samples = 7 value = [12, 0] class = d 73->77 75 gini = 0.444 samples = 13 value = [10, 5] class = d 74->75 76 gini = 0.0 samples = 17 value = [26, 0] class = d 74->76 80 gini = 0.0 samples = 1 value = [1, 0] class = d 79->80 81 gini = 0.0 samples = 1 value = [0, 2] class = e 79->81 83 housing_own <= 0.5 gini = 0.306 samples = 65 value = [86, 20] class = d 82->83 108 gini = 0.0 samples = 1 value = [0, 1] class = e 82->108 84 amount <= 0.353 gini = 0.367 samples = 23 value = [25, 8] class = d 83->84 99 credit_history_poor <= 0.5 gini = 0.275 samples = 42 value = [61, 12] class = d 83->99 85 checking_balance_unknown <= 0.5 gini = 0.26 samples = 17 value = [22, 4] class = d 84->85 92 checking_balance_< 0 DM <= 0.5 gini = 0.49 samples = 6 value = [3, 4] class = e 84->92 86 checking_balance_> 200 DM <= 0.5 gini = 0.332 samples = 13 value = [15, 4] class = d 85->86 91 gini = 0.0 samples = 4 value = [7, 0] class = d 85->91 87 credit_history_perfect <= 0.5 gini = 0.36 samples = 12 value = [13, 4] class = d 86->87 90 gini = 0.0 samples = 1 value = [2, 0] class = d 86->90 88 gini = 0.305 samples = 11 value = [13, 3] class = d 87->88 89 gini = 0.0 samples = 1 value = [0, 1] class = e 87->89 93 credit_history_poor <= 0.5 gini = 0.444 samples = 5 value = [2, 4] class = e 92->93 98 gini = 0.0 samples = 1 value = [1, 0] class = d 92->98 94 housing_rent <= 0.5 gini = 0.32 samples = 4 value = [1, 4] class = e 93->94 97 gini = 0.0 samples = 1 value = [1, 0] class = d 93->97 95 gini = 0.0 samples = 2 value = [0, 2] class = e 94->95 96 gini = 0.444 samples = 2 value = [1, 2] class = e 94->96 100 employment_duration <= 0.875 gini = 0.298 samples = 36 value = [54, 12] class = d 99->100 107 gini = 0.0 samples = 6 value = [7, 0] class = d 99->107 101 checking_balance_> 200 DM <= 0.5 gini = 0.37 samples = 28 value = [37, 12] class = d 100->101 106 gini = 0.0 samples = 8 value = [17, 0] class = d 100->106 102 credit_history_good <= 0.5 gini = 0.375 samples = 27 value = [36, 12] class = d 101->102 105 gini = 0.0 samples = 1 value = [1, 0] class = d 101->105 103 gini = 0.0 samples = 1 value = [0, 1] class = e 102->103 104 gini = 0.359 samples = 26 value = [36, 11] class = d 102->104 110 employment_duration <= 0.875 gini = 0.083 samples = 41 value = [66, 3] class = d 109->110 133 amount <= 0.523 gini = 0.024 samples = 58 value = [83, 1] class = d 109->133 111 savings_balance_< 100 DM <= 0.5 gini = 0.045 samples = 26 value = [42, 1] class = d 110->111 122 months_loan_duration <= 0.14 gini = 0.142 samples = 15 value = [24, 2] class = d 110->122 112 gini = 0.0 samples = 8 value = [13, 0] class = d 111->112 113 housing_own <= 0.5 gini = 0.064 samples = 18 value = [29, 1] class = d 111->113 114 gini = 0.0 samples = 7 value = [13, 0] class = d 113->114 115 months_loan_duration <= 0.103 gini = 0.111 samples = 11 value = [16, 1] class = d 113->115 116 gini = 0.0 samples = 5 value = [7, 0] class = d 115->116 117 amount <= 0.141 gini = 0.18 samples = 6 value = [9, 1] class = d 115->117 118 gini = 0.0 samples = 4 value = [7, 0] class = d 117->118 119 checking_balance_< 0 DM <= 0.5 gini = 0.444 samples = 2 value = [2, 1] class = d 117->119 120 gini = 0.0 samples = 1 value = [2, 0] class = d 119->120 121 gini = 0.0 samples = 1 value = [0, 1] class = e 119->121 123 amount <= 0.029 gini = 0.245 samples = 9 value = [12, 2] class = d 122->123 132 gini = 0.0 samples = 6 value = [12, 0] class = d 122->132 124 gini = 0.0 samples = 1 value = [0, 1] class = e 123->124 125 checking_balance_1 - 200 DM <= 0.5 gini = 0.142 samples = 8 value = [12, 1] class = d 123->125 126 gini = 0.0 samples = 5 value = [9, 0] class = d 125->126 127 housing_own <= 0.5 gini = 0.375 samples = 3 value = [3, 1] class = d 125->127 128 gini = 0.0 samples = 1 value = [2, 0] class = d 127->128 129 age <= 0.295 gini = 0.5 samples = 2 value = [1, 1] class = d 127->129 130 gini = 0.0 samples = 1 value = [0, 1] class = e 129->130 131 gini = 0.0 samples = 1 value = [1, 0] class = d 129->131 134 gini = 0.0 samples = 57 value = [83, 0] class = d 133->134 135 gini = 0.0 samples = 1 value = [0, 1] class = e 133->135 137 employment_duration <= 0.125 gini = 0.332 samples = 28 value = [8, 30] class = e 136->137 162 checking_balance_< 0 DM <= 0.5 gini = 0.499 samples = 65 value = [57, 52] class = d 136->162 138 gini = 0.0 samples = 2 value = [2, 0] class = d 137->138 139 credit_history_poor <= 0.5 gini = 0.278 samples = 26 value = [6, 30] class = e 137->139 140 checking_balance_> 200 DM <= 0.5 gini = 0.225 samples = 22 value = [4, 27] class = e 139->140 155 savings_balance_100 - 500 DM <= 0.5 gini = 0.48 samples = 4 value = [2, 3] class = e 139->155 141 savings_balance_< 100 DM <= 0.5 gini = 0.18 samples = 21 value = [3, 27] class = e 140->141 154 gini = 0.0 samples = 1 value = [1, 0] class = d 140->154 142 gini = 0.0 samples = 9 value = [0, 15] class = e 141->142 143 other_credit_none <= 0.5 gini = 0.32 samples = 12 value = [3, 12] class = e 141->143 144 amount <= 0.368 gini = 0.48 samples = 3 value = [2, 3] class = e 143->144 147 housing_other <= 0.5 gini = 0.18 samples = 9 value = [1, 9] class = e 143->147 145 gini = 0.0 samples = 1 value = [2, 0] class = d 144->145 146 gini = 0.0 samples = 2 value = [0, 3] class = e 144->146 148 gini = 0.0 samples = 4 value = [0, 5] class = e 147->148 149 checking_balance_1 - 200 DM <= 0.5 gini = 0.32 samples = 5 value = [1, 4] class = e 147->149 150 amount <= 0.242 gini = 0.444 samples = 3 value = [1, 2] class = e 149->150 153 gini = 0.0 samples = 2 value = [0, 2] class = e 149->153 151 gini = 0.0 samples = 1 value = [0, 1] class = e 150->151 152 gini = 0.5 samples = 2 value = [1, 1] class = d 150->152 156 employment_duration <= 0.375 gini = 0.375 samples = 3 value = [1, 3] class = e 155->156 161 gini = 0.0 samples = 1 value = [1, 0] class = d 155->161 157 gini = 0.0 samples = 1 value = [0, 2] class = e 156->157 158 amount <= 0.409 gini = 0.5 samples = 2 value = [1, 1] class = d 156->158 159 gini = 0.0 samples = 1 value = [0, 1] class = e 158->159 160 gini = 0.0 samples = 1 value = [1, 0] class = d 158->160 163 other_credit_none <= 0.5 gini = 0.476 samples = 47 value = [47, 30] class = d 162->163 192 months_loan_duration <= 0.426 gini = 0.43 samples = 18 value = [10, 22] class = e 162->192 164 other_credit_bank <= 0.5 gini = 0.375 samples = 8 value = [3, 9] class = e 163->164 173 checking_balance_1 - 200 DM <= 0.5 gini = 0.437 samples = 39 value = [44, 21] class = d 163->173 165 age <= 0.125 gini = 0.5 samples = 4 value = [3, 3] class = d 164->165 172 gini = 0.0 samples = 4 value = [0, 6] class = e 164->172 166 gini = 0.0 samples = 1 value = [0, 2] class = e 165->166 167 months_loan_duration <= 0.426 gini = 0.375 samples = 3 value = [3, 1] class = d 165->167 168 gini = 0.0 samples = 1 value = [2, 0] class = d 167->168 169 checking_balance_1 - 200 DM <= 0.5 gini = 0.5 samples = 2 value = [1, 1] class = d 167->169 170 gini = 0.0 samples = 1 value = [1, 0] class = d 169->170 171 gini = 0.0 samples = 1 value = [0, 1] class = e 169->171 174 employment_duration <= 0.375 gini = 0.149 samples = 23 value = [34, 3] class = d 173->174 177 savings_balance_unknown <= 0.5 gini = 0.459 samples = 16 value = [10, 18] class = e 173->177 175 gini = 0.0 samples = 2 value = [0, 3] class = e 174->175 176 gini = 0.0 samples = 21 value = [34, 0] class = d 174->176 178 amount <= 0.243 gini = 0.426 samples = 14 value = [8, 18] class = e 177->178 191 gini = 0.0 samples = 2 value = [2, 0] class = d 177->191 179 credit_history_critical <= 0.5 gini = 0.48 samples = 6 value = [6, 4] class = d 178->179 186 amount <= 0.439 gini = 0.219 samples = 8 value = [2, 14] class = e 178->186 180 months_loan_duration <= 0.559 gini = 0.444 samples = 4 value = [4, 2] class = d 179->180 183 age <= 0.152 gini = 0.5 samples = 2 value = [2, 2] class = d 179->183 181 gini = 0.0 samples = 3 value = [4, 0] class = d 180->181 182 gini = 0.0 samples = 1 value = [0, 2] class = e 180->182 184 gini = 0.0 samples = 1 value = [2, 0] class = d 183->184 185 gini = 0.0 samples = 1 value = [0, 2] class = e 183->185 187 gini = 0.0 samples = 4 value = [0, 9] class = e 186->187 188 savings_balance_500 - 1000 DM <= 0.5 gini = 0.408 samples = 4 value = [2, 5] class = e 186->188 189 gini = 0.0 samples = 3 value = [0, 5] class = e 188->189 190 gini = 0.0 samples = 1 value = [2, 0] class = d 188->190 193 savings_balance_< 100 DM <= 0.5 gini = 0.32 samples = 3 value = [4, 1] class = d 192->193 198 credit_history_critical <= 0.5 gini = 0.346 samples = 15 value = [6, 21] class = e 192->198 194 gini = 0.0 samples = 1 value = [2, 0] class = d 193->194 195 age <= 0.25 gini = 0.444 samples = 2 value = [2, 1] class = d 193->195 196 gini = 0.0 samples = 1 value = [2, 0] class = d 195->196 197 gini = 0.0 samples = 1 value = [0, 1] class = e 195->197 199 gini = 0.0 samples = 10 value = [0, 16] class = e 198->199 200 age <= 0.152 gini = 0.496 samples = 5 value = [6, 5] class = d 198->200 201 gini = 0.0 samples = 2 value = [0, 5] class = e 200->201 202 gini = 0.0 samples = 3 value = [6, 0] class = d 200->202

AdaBoost¶

In [30]:
# Adaboost 
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
# Importing cross validation function from sklearn
from sklearn.model_selection import cross_val_score
from sklearn import metrics

# Choosing Decision Tree with 1 level as the weak learner
# Choose different values of max_depth, n_estimators and learning_rate to tune the model
DTC=DecisionTreeClassifier(max_depth=4)
Ada_Boost_model = AdaBoostClassifier(n_estimators=200, estimator=DTC ,learning_rate=0.01)

# Printing all the parameters of Adaboost
print(Ada_Boost_model)
# Saving Model
filename = '../model/ada_boost_model_model.mdl'
pickle.dump(Ada_Boost_model, open(filename, 'wb'))
# Creating the model on Training Data
AB=Ada_Boost_model.fit(X_train,y_train)
prediction=AB.predict(X_test)

# Measuring accuracy on Testing Data
print(metrics.classification_report(y_test, prediction))
cnf_matrix = metrics.confusion_matrix(y_test, prediction)
# Printing the Overall Accuracy of the model
F1_Score_ada_boost=metrics.f1_score(y_test, prediction, average='weighted')
print('Accuracy of the model on Testing Sample Data:', round(F1_Score,2))


# Running 10-Fold Cross validation on a given algorithm
# Passing full data X and y because the K-fold will split the data and automatically choose train/test
Accuracy_Values=cross_val_score(AB, X , y, cv=10, scoring='f1_weighted')
print('\nAccuracy values for 10-fold Cross Validation:\n',Accuracy_Values)
ada_boost_acc =  round(Accuracy_Values.mean(),2)
print('\nFinal Average Accuracy of the model:', ada_boost_acc)

# Plotting the feature importance for Top 10 most important columns
%matplotlib inline
feature_importances = pd.Series(AB.feature_importances_, index=Predictors)
feature_importances.nlargest(10).plot(kind='barh')
plt.savefig('../results/ada_boost_model_feature_importances.png',bbox_inches = 'tight')

class_names=[0,1] # name  of classes
fig, ax = plt.subplots()
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)
# create heatmap
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="YlGnBu" ,fmt='g')
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label') 
plt.savefig('../results/ada_boost_model_cnf_matrix.png',bbox_inches = 'tight') 
plt.show()
y_pred_proba = AB.predict_proba(X_test)[::,1]
fpr, tpr, _ = metrics.roc_curve(y_test,  y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr,tpr,label="auc="+str(auc))
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.legend(loc=4)

plt.savefig('../results/ada_boost_model_roc.png',bbox_inches = 'tight')
plt.show() 
AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=4),
                   learning_rate=0.01, n_estimators=200)
              precision    recall  f1-score   support

           0       0.78      0.90      0.84       209
           1       0.66      0.42      0.51        91

    accuracy                           0.76       300
   macro avg       0.72      0.66      0.67       300
weighted avg       0.74      0.76      0.74       300

Accuracy of the model on Testing Sample Data: 0.73

Accuracy values for 10-fold Cross Validation:
 [0.76351097 0.69806452 0.74604343 0.72604645 0.70133333 0.80460526
 0.72266667 0.75256116 0.68221388 0.68344988]

Final Average Accuracy of the model: 0.73
In [31]:
import graphviz
from sklearn import tree
# DOT data
dot_data = tree.export_graphviz(Ada_Boost_model.estimators_[4], out_file=None, 
                                feature_names=Predictors,  
                                class_names=TargetVariable,
                                filled=True)

# Draw graph
graph = graphviz.Source(dot_data, format="png") 
graph.render('../results/ada_boost_model_dtree',format='png', view=False)
graph
# Double click on the graph to zoom in
Out[31]:
Tree 0 checking_balance_unknown <= 0.5 gini = 0.426 samples = 700 value = [0.693, 0.307] class = d 1 months_loan_duration <= 0.426 gini = 0.485 samples = 425 value = [0.364, 0.256] class = d 0->1 True 14 age <= 0.08 gini = 0.232 samples = 275 value = [0.329, 0.051] class = d 0->14 False 2 age <= 0.366 gini = 0.459 samples = 343 value = [0.324, 0.18] class = d 1->2 9 employment_duration <= 0.125 gini = 0.451 samples = 82 value = [0.04, 0.076] class = e 1->9 3 amount <= 0.062 gini = 0.483 samples = 252 value = [0.221, 0.151] class = d 2->3 6 amount <= 0.418 gini = 0.34 samples = 91 value = [0.103, 0.029] class = d 2->6 4 gini = 0.493 samples = 81 value = [0.053, 0.067] class = e 3->4 5 gini = 0.445 samples = 171 value = [0.167, 0.084] class = d 3->5 7 gini = 0.301 samples = 87 value = [0.103, 0.023] class = d 6->7 8 gini = -0.0 samples = 4 value = [0.0, 0.005] class = e 6->8 10 gini = 0.0 samples = 7 value = [0.005, 0.0] class = d 9->10 11 savings_balance_unknown <= 0.5 gini = 0.43 samples = 75 value = [0.035, 0.076] class = e 9->11 12 gini = 0.383 samples = 64 value = [0.024, 0.07] class = e 11->12 13 gini = 0.466 samples = 11 value = [0.01, 0.006] class = d 11->13 15 savings_balance_< 100 DM <= 0.5 gini = 0.5 samples = 15 value = [0.009, 0.009] class = d 14->15 20 other_credit_none <= 0.5 gini = 0.206 samples = 260 value = [0.32, 0.042] class = d 14->20 16 gini = 0.0 samples = 6 value = [0.004, 0.0] class = d 15->16 17 amount <= 0.1 gini = 0.448 samples = 9 value = [0.005, 0.009] class = e 15->17 18 gini = 0.0 samples = 4 value = [0.0, 0.006] class = e 17->18 19 gini = 0.477 samples = 5 value = [0.005, 0.003] class = d 17->19 21 age <= 0.125 gini = 0.437 samples = 46 value = [0.04, 0.019] class = d 20->21 24 amount <= 0.421 gini = 0.142 samples = 214 value = [0.281, 0.023] class = d 20->24 22 gini = 0.44 samples = 3 value = [0.001, 0.003] class = e 21->22 23 gini = 0.414 samples = 43 value = [0.038, 0.016] class = d 21->23 25 gini = 0.113 samples = 204 value = [0.272, 0.017] class = d 24->25 26 gini = 0.482 samples = 10 value = [0.009, 0.006] class = d 24->26

XGBoost¶

In [32]:
# Xtreme Gradient Boosting (XGBoost)
from xgboost import XGBClassifier
# Importing cross validation function from sklearn
from sklearn.model_selection import cross_val_score
from sklearn import metrics

XGB_model=XGBClassifier(max_depth=10, learning_rate=0.01, n_estimators=200, objective='binary:logistic', booster='gbtree')

# Printing all the parameters of XGBoost
print(XGB_model)
# Saving Model
filename = '../model/xgb_model_model_model.mdl'
pickle.dump(XGB_model, open(filename, 'wb'))
# Creating the model on Training Data
XGB=XGB_model.fit(X_train,y_train)
prediction=XGB.predict(X_test)

# Measuring accuracy on Testing Data
print(metrics.classification_report(y_test, prediction))
cnf_matrix = metrics.confusion_matrix(y_test, prediction)
# Printing the Overall Accuracy of the model
F1_Score=metrics.f1_score(y_test, prediction, average='weighted')
print('Accuracy of the model on Testing Sample Data:', round(F1_Score,2))


# Running 10-Fold Cross validation on a given algorithm
# Passing full data X and y because the K-fold will split the data and automatically choose train/test
Accuracy_Values=cross_val_score(XGB, X , y, cv=10, scoring='f1_weighted')
print('\nAccuracy values for 10-fold Cross Validation:\n',Accuracy_Values)
xgb_acc =  round(Accuracy_Values.mean(),2)
print('\nFinal Average Accuracy of the model:', xgb_acc)

# Plotting the feature importance for Top 10 most important columns
%matplotlib inline
feature_importances = pd.Series(XGB.feature_importances_, index=Predictors)
feature_importances.nlargest(10).plot(kind='barh')
plt.savefig('../results/xgb_model_feature_importances.png',bbox_inches = 'tight')


class_names=[0,1] # name  of classes
fig, ax = plt.subplots()
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)
# create heatmap
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="YlGnBu" ,fmt='g')
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.savefig('../results/xgb_model_cnf_matrix.png',bbox_inches = 'tight') 
plt.show() 
y_pred_proba = XGB.predict_proba(X_test)[::,1]
fpr, tpr, _ = metrics.roc_curve(y_test,  y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr,tpr,label="auc="+str(auc))
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.legend(loc=4)

plt.savefig('../results/xgb_model_roc.png',bbox_inches = 'tight')
plt.show() 
XGBClassifier(base_score=None, booster='gbtree', callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.01, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=10, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=200, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)
              precision    recall  f1-score   support

           0       0.77      0.90      0.83       209
           1       0.62      0.37      0.47        91

    accuracy                           0.74       300
   macro avg       0.69      0.64      0.65       300
weighted avg       0.72      0.74      0.72       300

Accuracy of the model on Testing Sample Data: 0.72

Accuracy values for 10-fold Cross Validation:
 [0.78896502 0.70133333 0.76886645 0.70602911 0.71388889 0.78289474
 0.744      0.78667648 0.70602911 0.72266667]

Final Average Accuracy of the model: 0.74
In [33]:
# max_depth=10 is too large to be plot here

from xgboost import plot_tree
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(100, 40))
plot_tree(XGB, num_trees=10, ax=ax)
plt.savefig('../results/xgb_boost_model_dtree.png',bbox_inches = 'tight')
plt.show()
# Double click on the graph to zoom in

KNN¶

In [34]:
from sklearn.neighbors import KNeighborsClassifier
# Importing cross validation function from sklearn
from sklearn.model_selection import cross_val_score
from sklearn import metrics
KNN_model = KNeighborsClassifier(n_neighbors=3)

# Printing all the parameters of KNN
print(KNN_model)
# Saving Model
filename = '../model/kneighbors_model_model_model.mdl'
pickle.dump(KNN_model, open(filename, 'wb'))
# Creating the model on Training Data
KNN=KNN_model.fit(X_train,y_train)
prediction=KNN.predict(X_test)

# Measuring accuracy on Testing Data
print(metrics.classification_report(y_test, prediction))
cnf_matrix=metrics.confusion_matrix(y_test, prediction) 
# Printing the Overall Accuracy of the model
F1_Score=metrics.f1_score(y_test, prediction, average='weighted')
print('Accuracy of the model on Testing Sample Data:', round(F1_Score,2))


# Running 10-Fold Cross validation on a given algorithm
# Passing full data X and y because the K-fold will split the data and automatically choose train/test
Accuracy_Values=cross_val_score(KNN, X , y, cv=10, scoring='f1_weighted')
print('\nAccuracy values for 10-fold Cross Validation:\n',Accuracy_Values)
kneighbors_acc =  round(Accuracy_Values.mean(),2)
print('\nFinal Average Accuracy of the model:', kneighbors_acc)

class_names=[0,1] # name  of classes
fig, ax = plt.subplots()
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)
# create heatmap
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="YlGnBu" ,fmt='g')
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.savefig('../results/kneighbors_model_cnf_matrix.png',bbox_inches = 'tight') 
plt.show() 
y_pred_proba = KNN.predict_proba(X_test)[::,1]
fpr, tpr, _ = metrics.roc_curve(y_test,  y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr,tpr,label="auc="+str(auc))
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.legend(loc=4)

plt.savefig('../results/kneighbors_model_roc.png',bbox_inches = 'tight')
plt.show() 
# Plotting the feature importance for Top 10 most important columns
# There is no built-in method to get feature importance in KNN
KNeighborsClassifier(n_neighbors=3)
              precision    recall  f1-score   support

           0       0.78      0.85      0.82       209
           1       0.58      0.46      0.51        91

    accuracy                           0.73       300
   macro avg       0.68      0.66      0.66       300
weighted avg       0.72      0.73      0.72       300

Accuracy of the model on Testing Sample Data: 0.72

Accuracy values for 10-fold Cross Validation:
 [0.75665219 0.67434211 0.68949772 0.71714424 0.71714424 0.75665219
 0.59466667 0.70952831 0.68503119 0.67673628]

Final Average Accuracy of the model: 0.7

SVM¶

In [35]:
# Support Vector Machines(SVM)
from sklearn import svm
# Importing cross validation function from sklearn
from sklearn.model_selection import cross_val_score
from sklearn import metrics

svm_model= svm.SVC(C=2, kernel='rbf', gamma=0.1,probability=True)

# Printing all the parameters of KNN
print(svm_model)
# Saving Model
filename = '../model/svm_model_model_model.mdl'
pickle.dump(svm_model, open(filename, 'wb'))
# Creating the model on Training Data
SVM=svm_model.fit(X_train,y_train)
prediction=SVM.predict(X_test)

# Measuring accuracy on Testing Data
print(metrics.classification_report(y_test, prediction))
cnf_matrix = metrics.confusion_matrix(y_test, prediction)
# Printing the Overall Accuracy of the model
F1_Score=metrics.f1_score(y_test, prediction, average='weighted')
print('Accuracy of the model on Testing Sample Data:', round(F1_Score,2))


# Running 10-Fold Cross validation on a given algorithm
# Passing full data X and y because the K-fold will split the data and automatically choose train/test
Accuracy_Values=cross_val_score(SVM, X , y, cv=10, scoring='f1_weighted')
print('\nAccuracy values for 10-fold Cross Validation:\n',Accuracy_Values)
svm_acc =  round(Accuracy_Values.mean(),2)
print('\nFinal Average Accuracy of the model:', svm_acc)

class_names=[0,1] # name  of classes
fig, ax = plt.subplots()
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)
# create heatmap
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="YlGnBu" ,fmt='g')
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.savefig('../results/svm_model_cnf_matrix.png',bbox_inches = 'tight') 
plt.show() 
y_pred_proba = SVM.predict_proba(X_test)[::,1]
fpr, tpr, _ = metrics.roc_curve(y_test,  y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr,tpr,label="auc="+str(auc))
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.legend(loc=4)

plt.savefig('../results/svm_model_roc.png',bbox_inches = 'tight')
plt.show() 
# Plotting the feature importance for Top 10 most important columns
# The built in attribute SVM.coef_ works only for linear kernel
SVC(C=2, gamma=0.1, probability=True)
              precision    recall  f1-score   support

           0       0.78      0.87      0.83       209
           1       0.60      0.45      0.52        91

    accuracy                           0.74       300
   macro avg       0.69      0.66      0.67       300
weighted avg       0.73      0.74      0.73       300

Accuracy of the model on Testing Sample Data: 0.73

Accuracy values for 10-fold Cross Validation:
 [0.73549151 0.74796273 0.68949772 0.71776316 0.73947368 0.75388863
 0.64497905 0.74796273 0.69909584 0.7511499 ]

Final Average Accuracy of the model: 0.72

Naive Bayes¶

In [36]:
# Naive Bays
from sklearn.naive_bayes import GaussianNB, MultinomialNB
# Importing cross validation function from sklearn
from sklearn.model_selection import cross_val_score
from sklearn import metrics

# GaussianNB is used in Binomial Classification
# MultinomialNB is used in multi-class classification
nb_model = GaussianNB()

# Printing all the parameters of Naive Bayes
print(nb_model)
# Saving Model
filename = '../model/naive_bayes_model_model.mdl'
pickle.dump(nb_model, open(filename, 'wb'))

NB=nb_model.fit(X_train,y_train)
prediction=NB.predict(X_test)

# Measuring accuracy on Testing Data
print(metrics.classification_report(y_test, prediction))
cnf_matrix = metrics.confusion_matrix(y_test, prediction)
# Printing the Overall Accuracy of the model
F1_Score=metrics.f1_score(y_test, prediction, average='weighted')
print('Accuracy of the model on Testing Sample Data:', round(F1_Score,2))

# Running 10-Fold Cross validation on a given algorithm
# Passing full data X and y because the K-fold will split the data and automatically choose train/test
Accuracy_Values=cross_val_score(NB, X , y, cv=10, scoring='f1_weighted')
print('\nAccuracy values for 10-fold Cross Validation:\n',Accuracy_Values)
naive_bayes_acc =  round(Accuracy_Values.mean(),2)
print('\nFinal Average Accuracy of the model:', naive_bayes_acc)

class_names=[0,1] # name  of classes
fig, ax = plt.subplots()
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)
# create heatmap
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="YlGnBu" ,fmt='g')
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.savefig('../results/naive_bayes_model_cnf_matrix.png',bbox_inches = 'tight') 
plt.show() 
y_pred_proba = NB.predict_proba(X_test)[::,1]
fpr, tpr, _ = metrics.roc_curve(y_test,  y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr,tpr,label="auc="+str(auc))
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.legend(loc=4)

plt.savefig('../results/naive_bayes_model_roc.png',bbox_inches = 'tight')
plt.show() 
GaussianNB()
              precision    recall  f1-score   support

           0       0.82      0.77      0.79       209
           1       0.54      0.63      0.58        91

    accuracy                           0.72       300
   macro avg       0.68      0.70      0.69       300
weighted avg       0.74      0.72      0.73       300

Accuracy of the model on Testing Sample Data: 0.73

Accuracy values for 10-fold Cross Validation:
 [0.74599729 0.67673628 0.70496324 0.71578348 0.71133388 0.81607143
 0.64319776 0.70857074 0.72798574 0.75498575]

Final Average Accuracy of the model: 0.72

Comprasion Between Models¶

In [37]:
# Classifier names and their corresponding accuracy scores
classifiers = ['Logistic Regression', 'Decision Trees', 'Random Forest', 
               'AdaBoost', 'XGBoost', 'KNeighbors', 'SVM', 'Naive Bayes']
accuracy_scores = [logistic_regression_acc, dicision_trees_acc, random_forest_acc, ada_boost_acc, xgb_acc, kneighbors_acc,svm_acc, naive_bayes_acc]

# Create a bar chart
plt.figure(figsize=(10, 6))
plt.bar(classifiers, accuracy_scores, color='blue')
plt.xlabel('Classifier')
plt.ylabel('Accuracy Score')
plt.title('Accuracy Scores of Different Classifiers')
plt.ylim(0.6, 0.8)  # Set y-axis limits for better visualization
plt.xticks(rotation=45)
plt.savefig('../results/final_model_acc_compare.png',bbox_inches = 'tight') 
plt.tight_layout()

# Display the chart
plt.show()
In [ ]: